Streaming TV Service
@YouTubeTV
@hulu
@Philo
@Sling
@AppleTV
@DisneyTVA
@ItsOnATT
@fuboTV
https://www.techhive.com/article/3211536/best-streaming-tv-service.html
Since the team wanted to achieve consistent results, the output was retrieved once and stored as CSV files. Therefore, the commented-out code below is the original code used to pull the data, and the CSV-import lines of code were used throughout the week.
import pandas as pd
from twitter_scraper import get_tweets
# %%time
# #YouTubeTV Hashtag Scrape
# YouTubeTV = [tweet['text'] for tweet in get_tweets('#YouTubeTV', pages=100)]
# YouTubeTVdf = pd.DataFrame(YouTubeTV)
# YouTubeTVdf.to_csv('C:/Users/1906g/Desktop/randomfolder/youtube.csv')
# %%time
# #Hulu Hashtag Scrape
# Hulu = [tweet['text'] for tweet in get_tweets('#hulu', pages=100)]
# Huludf = pd.DataFrame(Hulu)
# Huludf.to_csv('C:/Users/1906g/Desktop/randomfolder/Hulu.csv')
# %%time
# #Philo Hashtag Scrape
# Philo = [tweet['text'] for tweet in get_tweets('#Philo', pages=100)]
# Philodf = pd.DataFrame(Philo)
# Philodf.to_csv('C:/Users/1906g/Desktop/randomfolder/Philo.csv')
# %%time
# #Sling Hashtag Scrape
# Sling = [tweet['text'] for tweet in get_tweets('#Sling', pages=100)]
# Slingdf = pd.DataFrame(Sling)
# Slingdf.to_csv('C:/Users/1906g/Desktop/randomfolder/Sling.csv')
# %%time
# #AppleTV Hashtag Scrape
# AppleTV = [tweet['text'] for tweet in get_tweets('#AppleTV', pages=100)]
# AppleTVdf = pd.DataFrame(AppleTV)
# AppleTVdf.to_csv('C:/Users/1906g/Desktop/randomfolder/AppleTV.csv')
# %%time
# #DisneyTV Hashtag Scrape
# DisneyTV = [tweet['text'] for tweet in get_tweets('#DisneyTV', pages=100)]
# DisneyTVdf = pd.DataFrame(DisneyTV)
# DisneyTVdf.to_csv('C:/Users/1906g/Desktop/randomfolder/DisneyTV.csv')
# %%time
# #ItsOnATT Hashtag Scrape
# ItsOnATT = [tweet['text'] for tweet in get_tweets('#ItsOnATT', pages=100)]
# ItsOnATTdf = pd.DataFrame(ItsOnATT)
# ItsOnATTdf.to_csv('C:/Users/1906g/Desktop/randomfolder/ItsOnATT.csv')
# %%time
# #fuboTV Hashtag Scrape
# fuboTV = [tweet['text'] for tweet in get_tweets('#fuboTV', pages=100)]
# fuboTVdf = pd.DataFrame(fuboTV)
# fuboTVdf.to_csv('C:/Users/1906g/Desktop/randomfolder/fuboTV.csv')
# import CSV of data pulled once to maintain consistency
def _load_tweet_texts(csv_path):
    """Load one scraped-tweet export; return the tweet texts as a list.

    Column 1 of each export holds the tweet text; skiprows=1 drops the
    header row that DataFrame.to_csv wrote when the data was scraped.
    """
    return pd.read_csv(csv_path, header=None, skiprows=1)[1].values.tolist()

YouTubeTV = _load_tweet_texts('youtube.csv')
Hulu = _load_tweet_texts('Hulu.csv')
Philo = _load_tweet_texts('Philo.csv')
Sling = _load_tweet_texts('Sling.csv')
AppleTV = _load_tweet_texts('AppleTV.csv')
DisneyTV = _load_tweet_texts('DisneyTV.csv')
ItsOnATT = _load_tweet_texts('ItsOnATT.csv')
fuboTV = _load_tweet_texts('fuboTV.csv')
import nltk
nltk.download('stopwords')
# import spacy
import unicodedata
import re
from nltk.corpus import wordnet
import collections
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Shared tokenizer and default English stop-word list used by the text
# normalization helpers defined below.
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
def strip_html_tags(text):
    """Return *text* with HTML markup removed.

    If the text parses as HTML, drop <iframe>/<script> elements entirely,
    extract the visible text, and collapse runs of CR/LF characters into a
    single newline.  Text that contains no HTML tags is returned unchanged.
    """
    soup = BeautifulSoup(text, "html.parser")
    if bool(soup.find()):
        [s.extract() for s in soup(['iframe', 'script'])]
        stripped_text = soup.get_text()
        # Bug fix: the old pattern r'[\r|\n|\r\n]+' was a character class,
        # so it also replaced literal '|' characters with newlines.
        # [\r\n]+ collapses newline runs only.
        stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
    else:
        stripped_text = text
    return stripped_text
#def lemmatize_text(text):
# text = nlp(text)
# text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
# return text
def simple_porter_stemming(text):
    """Reduce every whitespace-separated token in *text* to its Porter stem."""
    stemmer = nltk.porter.PorterStemmer()
    stemmed = (stemmer.stem(token) for token in text.split())
    return ' '.join(stemmed)
def remove_accented_chars(text):
    """Transliterate accented characters to their closest ASCII equivalents.

    NFKD decomposition separates base letters from combining marks; the
    ASCII encode/decode round-trip then silently drops the marks.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
def remove_special_characters(text, remove_digits=False):
    """Strip characters other than letters, whitespace and (optionally) digits."""
    if remove_digits:
        pattern = r'[^a-zA-Z\s]|\[|\]'
    else:
        pattern = r'[^a-zA-Z0-9\s]|\[|\]'
    return re.sub(pattern, '', text)
def remove_stopwords(text, stopwords=stopword_list):
    """Drop tokens whose lowercase form appears in *stopwords*; rejoin with spaces.

    Uses the module-level Toktok tokenizer; matching is case-insensitive but
    the surviving tokens keep their original casing.
    """
    stripped = (tok.strip() for tok in tokenizer.tokenize(text))
    kept = [tok for tok in stripped if tok.lower() not in stopwords]
    return ' '.join(kept)
def normalize_corpus(corpus, html_stripping=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_stemming=False, text_lemmatization=False,
                     special_char_removal=True, remove_digits=True,
                     stopword_removal=True, stopwords=stopword_list):
    """Apply the configured cleaning steps to every document in *corpus*.

    Parameters mirror the individual helpers: HTML stripping, accent
    removal, optional stemming/lemmatization, special-character removal,
    lowercasing and stop-word removal.  Returns a new list of cleaned
    documents; the input corpus is not modified.

    NOTE(review): lemmatize_text is commented out at module level, so
    passing text_lemmatization=True will raise NameError.
    """
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # Replace newlines/tabs/CRs with spaces.  Bug fix: str.maketrans
        # with two string arguments requires equal lengths; the original
        # maketrans("\n\t\r", " ") raised ValueError on every document.
        doc = doc.translate(doc.maketrans("\n\t\r", "   "))
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # stem text (skipped when lemmatization already ran)
        if text_stemming and not text_lemmatization:
            doc = simple_porter_stemming(doc)
        # remove special characters and\or digits
        if special_char_removal:
            # insert spaces between special characters to isolate them
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, stopwords=stopwords)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        doc = doc.strip()
        normalized_corpus.append(doc)
    return normalized_corpus
# Normalize each brand's raw tweet list into a cleaned corpus (HTML
# stripped, accents/special characters removed, lowercased, stop words
# dropped) using the default normalize_corpus settings.
#YouTube Corpus
corpus_YouTubeTV = normalize_corpus(YouTubeTV)
#Hulu Corpus
corpus_Hulu = normalize_corpus(Hulu)
#Philo Corpus
corpus_Philo = normalize_corpus(Philo)
#Sling Corpus
corpus_Sling = normalize_corpus(Sling)
#AppleTV Corpus
corpus_AppleTV = normalize_corpus(AppleTV)
#DisneyTV Corpus
corpus_DisneyTV = normalize_corpus(DisneyTV)
#ItsOnATT Corpus
corpus_ItsOnATT = normalize_corpus(ItsOnATT)
#fuboTV Corpus
corpus_fuboTV = normalize_corpus(fuboTV)
def _brand_tfidf(corpus):
    """Fit a TF-IDF vectorizer on *corpus*.

    Returns (vectorizer, dense matrix, vocabulary list, DataFrame of
    scores rounded to 2 decimals with vocabulary terms as columns).
    """
    vec = TfidfVectorizer(min_df=0., max_df=1., norm='l2',
                          use_idf=True, smooth_idf=True)
    matrix = vec.fit_transform(corpus).toarray()
    vocabulary = vec.get_feature_names()
    frame = pd.DataFrame(np.round(matrix, 2), columns=vocabulary)
    return vec, matrix, vocabulary, frame

#YouTube TD-IDF
YouTubeTV_vec, YouTubeTV_matrix, vocab, YouTubeTV_TFIDF = _brand_tfidf(corpus_YouTubeTV)
YouTubeTV_TFIDF.head(5)
#https://www.youtube.com/watch?v=WN18JksF9Cg
#Count Vectorizer Vs TF-IDF for Text Processing
#Hulu TD-IDF
Hulu_vec, Hulu_matrix, vocab, Hulu_TFIDF = _brand_tfidf(corpus_Hulu)
Hulu_TFIDF.head(5)
#Philo TD-IDF
Philo_vec, Philo_matrix, vocab, Philo_TFIDF = _brand_tfidf(corpus_Philo)
Philo_TFIDF.head(5)
#Sling TD-IDF
Sling_vec, Sling_matrix, vocab, Sling_TFIDF = _brand_tfidf(corpus_Sling)
Sling_TFIDF.head(5)
#AppleTV TD-IDF
AppleTV_vec, AppleTV_matrix, vocab, AppleTV_TFIDF = _brand_tfidf(corpus_AppleTV)
AppleTV_TFIDF.head(5)
#DisneyTV TD-IDF
DisneyTV_vec, DisneyTV_matrix, vocab, DisneyTV_TFIDF = _brand_tfidf(corpus_DisneyTV)
DisneyTV_TFIDF.head(5)
#ItsOnATT TD-IDF
ItsOnATT_vec, ItsOnATT_matrix, vocab, ItsOnATT_TFIDF = _brand_tfidf(corpus_ItsOnATT)
ItsOnATT_TFIDF.head(5)
#fuboTV TD-IDF
fuboTV_vec, fuboTV_matrix, vocab, fuboTV_TFIDF = _brand_tfidf(corpus_fuboTV)
fuboTV_TFIDF.head(5)
import nltk
from nltk.corpus import webtext
from nltk.probability import FreqDist
from wordcloud import WordCloud
import matplotlib.pyplot as plt
def CreateWordCloud(STOPWORDS, corpus, tilte = 'title'):
    """Render and display a word cloud for *corpus*.

    Removes URLs, escape-like byte sequences and non-letters, drops words
    of length <= 2 and stop words, then plots a 600x400 WordCloud titled
    with *tilte*.

    NOTE(review): the parameter name 'tilte' is a typo for 'title'; it is
    kept so any existing keyword callers keep working.
    """
    # Bug fix: ''.join glued the last word of one document to the first
    # word of the next, fabricating tokens; join with a space instead.
    raw_string = ' '.join(corpus)
    no_links = re.sub(r'http\S+', '', raw_string)
    no_unicode = re.sub(r"\\[a-z][a-z]?[0-9]+", '', no_links)
    no_special_characters = re.sub('[^A-Za-z ]+', '', no_unicode)
    words = no_special_characters.split(" ")
    words = [w for w in words if len(w) > 2]  # ignore a, an, be, ...
    words = [w.lower() for w in words]
    words = [w for w in words if w not in STOPWORDS]
    wc = WordCloud(width = 600, height = 400, background_color="white", max_words=2000)
    clean_string = ','.join(words)
    wc.generate(clean_string)
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wc)
    plt.title(tilte, size=50)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    plt.show()
Please note, the interpretation of these word cloud groupings is purely subjective.
The YouTubeTV word cloud reveals some type of relationship with HBO Max.
After further research, there are news headlines that say HBO and HBO Max are headed for YouTubeTV. Clearly this created lots of buzz on twitter as revealed in the word cloud.
#YouTube Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['youtubetv', 'youtube', 'https', 'pictwittercom', 'pic']
CreateWordCloud(STOPWORDS,corpus_YouTubeTV,'YouTubeTV Cloud')
Hulu’s word cloud reveals season two of something. There also appears to be popular shows such as sonic, good doctor, and breaking bad. Sonic appears multiple times.
After further research, it appears that at some point, Hulu landed the streaming rights to The Good Doctor. It clearly must have been worth it because the Good Doctor continues to appear in the twitter conversation. There are several Sonics on Hulu. There is Sonic Boom and there is Sonic The Hedgehog. This may explain why sonic appeared several times in the word cloud.
#Hulu Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['hulu', 'https', 'pictwittercom', 'pic']
CreateWordCloud(STOPWORDS,corpus_Hulu,'Hulu Cloud')
Much of Philo’s word cloud appear to be non-English words. Interestingly, some of the main English words in the word cloud are language, communication and age.
After further research, it appears that Philo offers channels similar to the other streaming services, however they are focus on lifestyle channels. The non-English words are likely based on the time the tweets were pulled. There was likely a popular lifestyle show that generated buzz in a language different than English.
#Philo Word Cloud
# Rebuild STOPWORDS from the base NLTK list; the extra French-looking terms
# filter the non-English #philo chatter discussed above.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['https', 'philo', 'philosophie', 'philosophie', 'learnet', 'labelles', 'est', 'tout', 'du', 'pic']
CreateWordCloud(STOPWORDS,corpus_Philo,'Philo Cloud')
During the time that the Tweets were pulled for Sling TV, there clearly was buzz around something related to a baby. Most of the keywords were cute child, carrierbaby, waistbaby, safe baby, and just baby.
After further research, the #sling hashtag is not only related to Sling TV. The #sling hashtag is apparently dominated by a baby sling product. Clearly, at the time the tweets were pulled, there was a lot of chatter from the twitter community using the #sling hashtag to communicate about babies and baby slings.
#Sling Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['sling', 'pictwittercom', 'https', 'http', 'pic']
CreateWordCloud(STOPWORDS,corpus_Sling,'Sling Cloud')
For AppleTV, there were lots of non-English keywords. There were also other brands associated with AppleTV tweets. A few of those keywords were Amazon, Alexa, Samsung, in addition to Smart TV. Compatible also appeared. This may reveal people making comparisons between AppleTV and compatible products.
Apple being the worldwide company that it is has a translation app. This may explain the universality of the words pulled in by their associated tweets. Also, during the time the tweets were pulled, there may have been lots of buzz from different languages.
#AppleTV Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['appletv', 'pictwittercom', 'http', 'pic']
CreateWordCloud(STOPWORDS,corpus_AppleTV,'AppleTV Cloud')
DisneyTV’s word cloud has mostly a positive connotation. With words like great dominating the word cloud.
Being that Disney has many kid and teenage related programs, the word cloud fits their brand.
#DisneyTV Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['disneytv', 'disney', 'https', 'pictwittercom', 'bitly', 'pic']
CreateWordCloud(STOPWORDS,corpus_DisneyTV,'DisneyTV Cloud')
Other word clouds had several dominant keywords. However, ItsOnATT has a more balanced distribution of words associated with it.
Directtv, nbcuniversal, filmoffice, Netflix, richarbranson, elonmusk are some of the notable keywords. It reveals big name associations with AT&T.
#ItsOnATT Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['itsonatt', 'https', 'twittercom']
CreateWordCloud(STOPWORDS,corpus_ItsOnATT,'ItsOnATT Cloud')
The word cloud for fuboTV is interesting. It appears that there may be a very popular cooking show. The keywords that dominate the fuboTV word cloud are chocolate chip cookie related. For example, chewy, cookies, gooey, baked, chocolate, chip are scattered throughout the word cloud.
After researching the fuboTV hashtag, it is very sparse. If you do a little scrolling, you will quickly get to 2019 tweets. If you scroll a little more, you begin to see tweets from 2018. It was difficult to scroll back far enough to see cookie-related tweets. Obviously, the twitter scraper was able to go far enough back to identify this focus.
#fuboTV Word Cloud
# Rebuild STOPWORDS from the base NLTK list, adding brand and URL tokens.
# NOTE(review): this is the last STOPWORDS assignment, so the later LDA
# cells that reference STOPWORDS inherit this fuboTV variant.
STOPWORDS = nltk.corpus.stopwords.words('english')
STOPWORDS = STOPWORDS + ['fubotv', 'https', 'pictwittercom', 'pic']
CreateWordCloud(STOPWORDS,corpus_fuboTV,'fuboTV Cloud')
#LDA conceptual breakdown.
#https://www.youtube.com/watch?v=DWJYZq_fQ2A
# LDA SCRIPT
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
Please note that each set of topics is, indeed, Brand specific within their individual 10-topic output. The specific brand terms were added as stop words and are not part of the visualization for the 10 most common words as well as the LDA topic outputs. We have interpreted the first 5 topics in each set of outputs and have noted the brand specific terms that are related to the specific streaming service if relevant.
Please note, the interpretation of these topics is purely subjective.
# Prepare the raw YouTubeTV tweets for LDA: strip punctuation, lowercase,
# and preview the cleaned text as a word cloud.
YouTubeTV_LDA = YouTubeTV
YouTubeTV_LDA = pd.DataFrame(YouTubeTV_LDA)
# Remove punctuation
YouTubeTV_LDA[0] = YouTubeTV_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
YouTubeTV_LDA[0] = YouTubeTV_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
YouTubeTV_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(YouTubeTV_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Load the library with the CountVectorizer method
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    """Bar-plot the ten highest-frequency terms in a CountVectorizer matrix.

    count_data is the sparse document-term matrix produced by
    count_vectorizer.fit_transform; term counts are summed across all
    documents before ranking.
    """
    vocabulary = count_vectorizer.get_feature_names()
    totals = np.zeros(len(vocabulary))
    for row in count_data:
        totals += row.toarray()[0]
    ranked = sorted(zip(vocabulary, totals), key=lambda pair: pair[1], reverse=True)[0:10]
    top_words = [pair[0] for pair in ranked]
    top_counts = [pair[1] for pair in ranked]
    positions = np.arange(len(top_words))
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(positions, top_counts, palette='husl')
    plt.xticks(positions, top_words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
# Initialise the count vectorizer with the English stop words
# NOTE(review): STOPWORDS here still holds the last word-cloud list assigned
# above (the fuboTV variant), so those terms are excluded here too — confirm
# this is intended.
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['youtubetv', 'youtube', 'https', 'pictwittercom'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(YouTubeTV_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the *n_top_words* strongest terms for each topic in a fitted LDA model."""
    vocabulary = count_vectorizer.get_feature_names()
    for topic_idx, weights in enumerate(model.components_):
        # argsort ascending, then slice from the end backwards for the top terms
        top_indices = weights.argsort()[:-n_top_words - 1:-1]
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([vocabulary[i] for i in top_indices]))
# Fit a 10-topic LDA on the YouTubeTV term counts and print each topic's
# top 10 words.  NOTE(review): no random_state is set, so topics can vary
# between runs.
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
YouTubeTV Topics Interpretation:
This batch of topics is brand specific around YouTubeTV and TCM, Michael Jordan, and the SharkTank brand.
# Hulu LDA pipeline: clean tweets, preview a word cloud, plot the ten most
# common terms, then fit a 10-topic LDA model and print its topics.
Hulu_LDA = Hulu
Hulu_LDA = pd.DataFrame(Hulu_LDA)
# Remove punctuation
Hulu_LDA[0] = Hulu_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
Hulu_LDA[0] = Hulu_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
Hulu_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(Hulu_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
# ('e3', '81', ... look like hex-byte artifacts from encoded tweets — TODO confirm)
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['hulu', 'https', 'pictwittercom',
                                                           'e3', '81', 'aa', '89', '82'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(Hulu_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
Hulu Topics Interpretation:
This batch of topics is brand specific with association between Hulu and Netflix.
# Philo LDA pipeline: clean tweets, preview a word cloud, plot the ten most
# common terms, then fit a 10-topic LDA model and print its topics.
Philo_LDA = Philo
Philo_LDA = pd.DataFrame(Philo_LDA)
# Remove punctuation
Philo_LDA[0] = Philo_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
Philo_LDA[0] = Philo_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
Philo_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(Philo_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
# (French filler words are added because much of the #philo traffic is
# non-English, per the word-cloud observations above)
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['https', 'philo', 'philosophie', 'philosophie', 'learnet', 'labelles', 'est', 'tout', 'du',
                                                           'de', 'le', 'la', 'et', 'les', 'en', 'vie', 'une', 'ce', 'ne'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(Philo_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
Philo Topics Interpretation:
# Sling LDA pipeline: clean tweets, preview a word cloud, plot the ten most
# common terms, then fit a 10-topic LDA model and print its topics.
Sling_LDA = Sling
Sling_LDA = pd.DataFrame(Sling_LDA)
# Remove punctuation
Sling_LDA[0] = Sling_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
Sling_LDA[0] = Sling_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
Sling_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(Sling_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['sling', 'pictwittercom', 'https', 'http'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(Sling_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
Sling Topics Interpretation:
Per our observation noted in the word cloud exercise, Sling TV shares the hashtag #sling with a baby sling product. This observation shows the limitation using hash tag research. The baby related sling tweets dominated the query.
As a result, only Topic #6 is related to Sling TV and it revolves around free sign up.
# AppleTV LDA pipeline: clean tweets, preview a word cloud, plot the ten most
# common terms, then fit a 10-topic LDA model and print its topics.
AppleTV_LDA = AppleTV
AppleTV_LDA = pd.DataFrame(AppleTV_LDA)
# Remove punctuation
AppleTV_LDA[0] = AppleTV_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
AppleTV_LDA[0] = AppleTV_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
AppleTV_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(AppleTV_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['appletv', 'pictwittercom', 'http', 'apple', 'de', 'en'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(AppleTV_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
AppleTV Topics Interpretation:
This batch of topics is brand specific around AppleTV and the Amazon and Samsung brands.
# DisneyTV LDA pipeline: clean tweets, preview a word cloud, plot the ten
# most common terms, then fit a 10-topic LDA model and print its topics.
DisneyTV_LDA = DisneyTV
DisneyTV_LDA = pd.DataFrame(DisneyTV_LDA)
# Remove punctuation
DisneyTV_LDA[0] = DisneyTV_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
DisneyTV_LDA[0] = DisneyTV_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
DisneyTV_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(DisneyTV_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['disneytv', 'disney', 'https', 'pictwittercom', 'bitly'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(DisneyTV_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
DisneyTV Topics Interpretation:
# ItsOnATT LDA pipeline: clean tweets, preview a word cloud, plot the ten
# most common terms, then fit a 10-topic LDA model and print its topics.
ItsOnATT_LDA = ItsOnATT
ItsOnATT_LDA = pd.DataFrame(ItsOnATT_LDA)
# Remove punctuation
ItsOnATT_LDA[0] = ItsOnATT_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
ItsOnATT_LDA[0] = ItsOnATT_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
ItsOnATT_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(ItsOnATT_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['itsonatt', 'https', 'twittercom'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(ItsOnATT_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
ItsOnATT Topics Interpretation:
AT&T is very brand specific and is associated with many other brands, such as NBC Universal, New Mexico Films, Direct TV, Netflix, the NFL, and many more. ItsOnATT is clearly the most brand associated of all the streaming services.
# fuboTV LDA pipeline: clean tweets, preview a word cloud, plot the ten
# most common terms, then fit a 10-topic LDA model and print its topics.
fuboTV_LDA = fuboTV
fuboTV_LDA = pd.DataFrame(fuboTV_LDA)
# Remove punctuation
fuboTV_LDA[0] = fuboTV_LDA[0].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
fuboTV_LDA[0] = fuboTV_LDA[0].map(lambda x: x.lower())
# Print out the first rows of papers
fuboTV_LDA.head()
# Join the different processed titles together.
long_string = ','.join(list(fuboTV_LDA[0].values))
# Create a WordCloud object
wordcloud = WordCloud(width = 600, height = 400, background_color="white", max_words=5000)
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words=STOPWORDS + ['fubotv', 'https', 'pictwittercom'])
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(fuboTV_LDA[0])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)
# Tweak the two parameters below
number_topics = 10
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
fuboTV Topics Interpretation:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from collections import Counter
Please note that each streaming service has its own elbow chart. The instructions were to extract between 3 and 5 clusters; however, there was no clear elbow between 3 and 5 clusters except for AppleTV and fuboTV. Since AppleTV had its elbow at 4 clusters, we decided to implement a 4-cluster solution for each streaming service.
Please note, the interpretation of these clusters is purely subjective.
# we always assume the max number of cluster would be 10
# you can judge the number of clusters by doing averaging
# method to visualize max no of clusters
def CreateElbowChart(input_to_fit):
    """Plot WCSS (inertia) for k = 1..10 K-Means fits of *input_to_fit*.

    Used to eyeball the "elbow" when choosing the cluster count.
    """
    wcss = []
    for i in range(1, 11):
        kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state =0)
        kmeans.fit(input_to_fit)
        wcss.append(kmeans.inertia_)
    plt.plot(range(1,11), wcss)
    plt.title('The Elbow Method')
    plt.xlabel('no of clusters')
    plt.ylabel('wcss')
    # Bug fix: the original ended with a bare `plt.show` (a reference that
    # does nothing); the function must be called to render the chart.
    plt.show()
# YouTubeTV K-Means: elbow chart, then a 4-cluster fit on raw term counts,
# printing each cluster's top centroid features.
CreateElbowChart(YouTubeTV_TFIDF)
# NOTE(review): stop_words is built here but the active CountVectorizer
# below does not use it (see the commented-out call) — confirm intent.
stop_words = nltk.corpus.stopwords.words('english') + ['youtubetv', 'youtube', 'https', 'pictwittercom', 'pic']
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_YouTubeTV)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_YouTubeTV_df = pd.DataFrame({'Document': corpus_YouTubeTV})
corpus_YouTubeTV_df['kmeans_cluster'] = km.labels_
corpus_YouTubeTV_df
# Keep the first 20 documents of each cluster for inspection.
YouTubeClusters = corpus_YouTubeTV_df.groupby('kmeans_cluster').head(20)
YouTubeClusters = YouTubeClusters.copy(deep=True)
feature_names = cv.get_feature_names()
topn_features = 15
# Sort each centroid's feature weights descending.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_num in range(NUM_CLUSTERS):
    key_features = [feature_names[index]
                    for index in ordered_centroids[cluster_num, :topn_features]]
    # NOTE(review): `testing` is assigned but never used afterwards.
    testing = YouTubeClusters[YouTubeClusters['kmeans_cluster'] == cluster_num].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('Key Features:', key_features)
YouTubeTV Cluster Interpretation:
# Hulu K-Means: elbow chart, then a 4-cluster fit on raw term counts,
# printing each cluster's top centroid features.
CreateElbowChart(Hulu_TFIDF)
# NOTE(review): stop_words is built here but the active CountVectorizer
# below does not use it (see the commented-out call) — confirm intent.
stop_words = nltk.corpus.stopwords.words('english') + ['hulu', 'https', 'pictwittercom', 'pic', 'e3', '81', 'aa', '89', '82']
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_Hulu)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_Hulu_df = pd.DataFrame({'Document': corpus_Hulu})
corpus_Hulu_df['kmeans_cluster'] = km.labels_
corpus_Hulu_df
# Keep the first 20 documents of each cluster for inspection.
HuluClusters = corpus_Hulu_df.groupby('kmeans_cluster').head(20)
HuluClusters = HuluClusters.copy(deep=True)
feature_names = cv.get_feature_names()
topn_features = 15
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_num in range(NUM_CLUSTERS):
    key_features = [feature_names[index]
                    for index in ordered_centroids[cluster_num, :topn_features]]
    # NOTE(review): `testing` is assigned but never used afterwards.
    testing = HuluClusters[HuluClusters['kmeans_cluster'] == cluster_num].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('Key Features:', key_features)
Hulu Cluster Interpretation:
# Philo K-Means: elbow chart, then a 4-cluster fit on raw term counts,
# printing each cluster's top centroid features.
CreateElbowChart(Philo_TFIDF)
# NOTE(review): stop_words is built here but the active CountVectorizer
# below does not use it (see the commented-out call) — confirm intent.
stop_words = nltk.corpus.stopwords.words('english') + ['https', 'philo', 'philosophie', 'philosophie', 'learnet', 'labelles', 'est', 'tout', 'du', 'pic', 'de', 'le', 'la', 'et', 'les', 'en', 'vie', 'une', 'ce', 'ne']
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_Philo)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_Philo_df = pd.DataFrame({'Document': corpus_Philo})
corpus_Philo_df['kmeans_cluster'] = km.labels_
corpus_Philo_df
# Keep the first 20 documents of each cluster for inspection.
PhiloClusters = corpus_Philo_df.groupby('kmeans_cluster').head(20)
PhiloClusters = PhiloClusters.copy(deep=True)
feature_names = cv.get_feature_names()
topn_features = 15
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
for cluster_num in range(NUM_CLUSTERS):
    key_features = [feature_names[index]
                    for index in ordered_centroids[cluster_num, :topn_features]]
    # NOTE(review): `testing` is assigned but never used afterwards.
    testing = PhiloClusters[PhiloClusters['kmeans_cluster'] == cluster_num].values.tolist()
    print('CLUSTER #'+str(cluster_num+1))
    print('Key Features:', key_features)
Philo Clusters:
# --- Sling: elbow chart, K-Means clustering, and top cluster terms ---
CreateElbowChart(Sling_TFIDF)
stop_words = nltk.corpus.stopwords.words('english') + ['sling', 'pictwittercom', 'https', 'http', 'pic']
# NOTE(review): stop_words is unused — the stop-word vectorizer is commented out.
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_Sling)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_Sling_df = pd.DataFrame({'Document': corpus_Sling})
corpus_Sling_df['kmeans_cluster'] = km.labels_
corpus_Sling_df
# First 20 documents per cluster for manual interpretation.
SlingClusters = corpus_Sling_df.groupby('kmeans_cluster').head(20)
SlingClusters = SlingClusters.copy(deep=True)
# NOTE(review): get_feature_names() removed in scikit-learn 1.2.
feature_names = cv.get_feature_names()
topn_features = 15
# Heaviest centroid terms first.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
# Loop body indentation was lost in the notebook flattening.
for cluster_num in range(NUM_CLUSTERS):
key_features = [feature_names[index]
for index in ordered_centroids[cluster_num, :topn_features]]
testing = SlingClusters[SlingClusters['kmeans_cluster'] == cluster_num].values.tolist()
print('CLUSTER #'+str(cluster_num+1))
print('Key Features:', key_features)
Sling Cluster Interpretation:
# --- AppleTV: elbow chart, K-Means clustering, and top cluster terms ---
CreateElbowChart(AppleTV_TFIDF)
stop_words = nltk.corpus.stopwords.words('english') + ['appletv', 'pictwittercom', 'http', 'pic', 'apple', 'de', 'en']
# NOTE(review): stop_words is unused — the stop-word vectorizer is commented out.
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_AppleTV)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_AppleTV_df = pd.DataFrame({'Document': corpus_AppleTV})
corpus_AppleTV_df['kmeans_cluster'] = km.labels_
corpus_AppleTV_df
# First 20 documents per cluster for manual interpretation.
AppleTVClusters = corpus_AppleTV_df.groupby('kmeans_cluster').head(20)
AppleTVClusters = AppleTVClusters.copy(deep=True)
# NOTE(review): get_feature_names() removed in scikit-learn 1.2.
feature_names = cv.get_feature_names()
topn_features = 15
# Heaviest centroid terms first.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
# Loop body indentation was lost in the notebook flattening.
for cluster_num in range(NUM_CLUSTERS):
key_features = [feature_names[index]
for index in ordered_centroids[cluster_num, :topn_features]]
testing = AppleTVClusters[AppleTVClusters['kmeans_cluster'] == cluster_num].values.tolist()
print('CLUSTER #'+str(cluster_num+1))
print('Key Features:', key_features)
AppleTV Cluster Interpretation:
# --- DisneyTV: elbow chart, K-Means clustering, and top cluster terms ---
CreateElbowChart(DisneyTV_TFIDF)
stop_words = nltk.corpus.stopwords.words('english') + ['disneytv', 'disney', 'https', 'pictwittercom', 'bitly', 'pic']
# NOTE(review): stop_words is unused — the stop-word vectorizer is commented out.
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_DisneyTV)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_DisneyTV_df = pd.DataFrame({'Document': corpus_DisneyTV})
corpus_DisneyTV_df['kmeans_cluster'] = km.labels_
corpus_DisneyTV_df
# First 20 documents per cluster for manual interpretation.
DisneyTVClusters = corpus_DisneyTV_df.groupby('kmeans_cluster').head(20)
DisneyTVClusters = DisneyTVClusters.copy(deep=True)
# NOTE(review): get_feature_names() removed in scikit-learn 1.2.
feature_names = cv.get_feature_names()
topn_features = 15
# Heaviest centroid terms first.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
# Loop body indentation was lost in the notebook flattening.
for cluster_num in range(NUM_CLUSTERS):
key_features = [feature_names[index]
for index in ordered_centroids[cluster_num, :topn_features]]
testing = DisneyTVClusters[DisneyTVClusters['kmeans_cluster'] == cluster_num].values.tolist()
print('CLUSTER #'+str(cluster_num+1))
print('Key Features:', key_features)
DisneyTV Cluster Interpretation:
# --- ItsOnATT: elbow chart, K-Means clustering, and top cluster terms ---
CreateElbowChart(ItsOnATT_TFIDF)
stop_words = nltk.corpus.stopwords.words('english') + ['itsonatt', 'https', 'twittercom']
# NOTE(review): stop_words is unused — the stop-word vectorizer is commented out.
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_ItsOnATT)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_ItsOnATT_df = pd.DataFrame({'Document': corpus_ItsOnATT})
corpus_ItsOnATT_df['kmeans_cluster'] = km.labels_
corpus_ItsOnATT_df
# First 20 documents per cluster for manual interpretation.
ItsOnATTClusters = corpus_ItsOnATT_df.groupby('kmeans_cluster').head(20)
ItsOnATTClusters = ItsOnATTClusters.copy(deep=True)
# NOTE(review): get_feature_names() removed in scikit-learn 1.2.
feature_names = cv.get_feature_names()
topn_features = 15
# Heaviest centroid terms first.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
# Loop body indentation was lost in the notebook flattening.
for cluster_num in range(NUM_CLUSTERS):
key_features = [feature_names[index]
for index in ordered_centroids[cluster_num, :topn_features]]
testing = ItsOnATTClusters[ItsOnATTClusters['kmeans_cluster'] == cluster_num].values.tolist()
print('CLUSTER #'+str(cluster_num+1))
print('Key Features:', key_features)
ItsOnATT Cluster Interpretation:
# --- fuboTV: elbow chart, K-Means clustering, and top cluster terms ---
CreateElbowChart(fuboTV_TFIDF)
stop_words = nltk.corpus.stopwords.words('english') + ['fubotv', 'https', 'pictwittercom', 'pic']
# NOTE(review): stop_words is unused — the stop-word vectorizer is commented out.
#cv = CountVectorizer(ngram_range=(1,2),min_df=10, max_df=0.8, stop_words=stop_words)
cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(corpus_fuboTV)
# cv_matrix.shape
NUM_CLUSTERS = 4
km = KMeans(n_clusters=NUM_CLUSTERS, max_iter=10000, n_init=50, random_state=42).fit(cv_matrix)
km
# Counter(km.labels_)
corpus_fuboTV_df = pd.DataFrame({'Document': corpus_fuboTV})
corpus_fuboTV_df['kmeans_cluster'] = km.labels_
corpus_fuboTV_df
# First 20 documents per cluster for manual interpretation.
fuboTVClusters = corpus_fuboTV_df.groupby('kmeans_cluster').head(20)
fuboTVClusters = fuboTVClusters.copy(deep=True)
# NOTE(review): get_feature_names() removed in scikit-learn 1.2.
feature_names = cv.get_feature_names()
topn_features = 15
# Heaviest centroid terms first.
ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
# Loop body indentation was lost in the notebook flattening.
for cluster_num in range(NUM_CLUSTERS):
key_features = [feature_names[index]
for index in ordered_centroids[cluster_num, :topn_features]]
testing = fuboTVClusters[fuboTVClusters['kmeans_cluster'] == cluster_num].values.tolist()
print('CLUSTER #'+str(cluster_num+1))
print('Key Features:', key_features)
fuboTV Clusters:
Limitations, delighters, and disappointers
# Imports for the sentiment-analysis and LDA topic-modeling section.
# FIX: removed duplicate imports — re, pandas, and nltk were each imported
# twice in the original (`import nltk as nltk` was redundant with `import nltk`).
import numpy as np
import pandas as pd
import re
import time
import math
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
import string
import warnings
# gensim emits noisy UserWarnings on import in some versions; silence them.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Single shared VADER analyser instance used by the per-tweet scoring below.
analyser = SentimentIntensityAnalyzer()
def sentiment_parser(x):
    """Map a VADER polarity-score dict to a sentiment label.

    Uses the conventional VADER compound-score cutoffs: <= -0.05 is
    'negative', >= 0.05 is 'positive', anything in between is 'neutral'.
    """
    compound = x['compound']
    if compound <= -0.05:
        return 'negative'
    if compound >= 0.05:
        return 'positive'
    return 'neutral'
# Wrap each service's raw tweet list in a one-column DataFrame for scoring.
# NOTE(review): YouTubeTV, Hulu, etc. are the scraped tweet lists; per the
# file header the live scrape was replaced by CSV loads, so confirm these
# names are (re)defined earlier in the notebook before this cell runs.
YouTubeTVdf_t = pd.DataFrame({'Document': YouTubeTV})
Huludf_t = pd.DataFrame({'Document': Hulu})
Philodf_t = pd.DataFrame({'Document': Philo})
Slingdf_t = pd.DataFrame({'Document': Sling})
AppleTVdf_t = pd.DataFrame({'Document': AppleTV})
DisneyTVdf_t = pd.DataFrame({'Document': DisneyTV})
ItsOnATTdf_t = pd.DataFrame({'Document': ItsOnATT})
fuboTVdf_t = pd.DataFrame({'Document': fuboTV})
def _score_sentiment(frame):
    """Add 'text_clean', 'vader_sentiment_test', and 'streamer_sentiment'
    columns to *frame* in place.

    Cleaning strips characters outside the regex class and lowercases the
    text; VADER then scores the cleaned text, and sentiment_parser collapses
    the score dict to a label.
    NOTE(review): inside the character class, ' - ' parses as the (empty)
    space-to-space range, so a literal '-' is stripped from tweets — confirm
    that is the intended behavior.
    """
    frame['text_clean'] = frame['Document'].map(lambda x: re.sub('[^a-zA-Z0-9 . , : - _]', '', str(x)))
    frame.text_clean = frame.text_clean.str.lower()
    frame['vader_sentiment_test'] = frame.text_clean.apply(lambda x: analyser.polarity_scores(x))
    frame['streamer_sentiment'] = frame['vader_sentiment_test'].apply(lambda x: sentiment_parser(x))

# FIX: the original repeated this identical 4-line pipeline eight times, once
# per streamer; apply the single helper to every frame instead.
for _frame in (YouTubeTVdf_t, Huludf_t, Philodf_t, Slingdf_t,
               AppleTVdf_t, DisneyTVdf_t, ItsOnATTdf_t, fuboTVdf_t):
    _score_sentiment(_frame)
# Trim each frame to the columns used downstream, tag every row with its
# streaming service, and stack all eight frames into one DataFrame.
_keep_cols = ['Document', 'text_clean', 'streamer_sentiment']
YouTubeTVdf_t = YouTubeTVdf_t[_keep_cols]
Huludf_t = Huludf_t[_keep_cols]
Philodf_t = Philodf_t[_keep_cols]
Slingdf_t = Slingdf_t[_keep_cols]
AppleTVdf_t = AppleTVdf_t[_keep_cols]
DisneyTVdf_t = DisneyTVdf_t[_keep_cols]
ItsOnATTdf_t = ItsOnATTdf_t[_keep_cols]
fuboTVdf_t = fuboTVdf_t[_keep_cols]
frames = [YouTubeTVdf_t, Huludf_t, Philodf_t, Slingdf_t, AppleTVdf_t, DisneyTVdf_t, ItsOnATTdf_t, fuboTVdf_t]
_labels = ['YouTubeTV', 'Hulu', 'Philo', 'Sling', 'AppleTV', 'DisneyTV', 'ItsOnATT', 'fuboTV']
for _frame, _label in zip(frames, _labels):
    _frame['streamer'] = _label
df = pd.concat(frames)
# Build the LDA input corpus from the combined cleaned tweets.
doc_complete = list(df.text_clean)
# Service names carry no topical signal, so treat them as stop words too.
stop_streamer_names = ['youtubetv','hulu','philo','sling', 'appletv','disneytv','itsonatt','fubotv']
stop = set(stopwords.words('english')).union(stop_streamer_names)
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
# clean(): lowercase, drop stop words, strip punctuation, then lemmatize.
# (Body indentation was lost in the notebook flattening; the three statements
# and the return below form the function body.)
def clean(doc):
stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
return normalized
# Tokenize every cleaned document.
doc_clean = [clean(doc).split() for doc in doc_complete]
# Second pass stripping service names from the token lists.
# NOTE(review): clean() already removes these via `stop`, so this filter is
# likely redundant; kept for fidelity with the original pipeline.
# FIX: replaced the nested append loop with an equivalent comprehension.
doc_clean = [[token for token in tokens if token not in stop_streamer_names]
             for tokens in doc_clean]
# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Creating the object for LDA model using gensim library
# NOTE(review): Lda is assigned but never used — LdaMulticore is fit below.
Lda = gensim.models.ldamodel.LdaModel
%%time
# Fit a 3-topic LDA model over all streamers' tweets.
ldamodel = LdaMulticore(doc_term_matrix, num_topics=3, id2word = dictionary, passes=50) #3 topics
print(*ldamodel.print_topics(num_topics=3, num_words=10), sep='\n')
%%time
# Refit with 8 topics (one per streamer) for a finer-grained view.
ldamodel = LdaMulticore(doc_term_matrix, num_topics=8, id2word = dictionary, passes=50) #8 topics
print(*ldamodel.print_topics(num_topics=8, num_words=15), sep='\n')
# --- "Live events" dimension: tweets mentioning live/news/sports terms ---
live_events = df[df.text_clean.str.contains('live|news|channel|next|game|missed|team')]
# Drop tweets that also look like original-programming chatter.
# NOTE(review): 'channel' appears in both the include and exclude patterns, so
# no tweet matched only via 'channel' survives this pair of filters — confirm
# that is intended.
live_events = live_events[~ live_events.text_clean.str.contains('episode|movie|original|show|film|artiste|channel')]
# NOTE(review): percent_calc is defined but never used.
percent_calc = lambda y:y.sum()/y.count()
live_event_counts = live_events.groupby(['streamer','streamer_sentiment']).agg({'Document':['count']})
live_event_counts
# Per-streamer share of each sentiment class.
live_event_counts.groupby(level=0).apply(lambda x:x / float(x.sum()))
cust_percent = live_event_counts.groupby(level=0).apply(lambda x:x / float(x.sum()))
names = cust_percent.index.get_level_values(0)
# Score per streamer = 1 - sum of squared sentiment shares (a Herfindahl-style
# diversity measure over the sentiment distribution).
values = np.square(cust_percent.values)
names
print('Final Scores:')
# NOTE(review): the slice boundaries below are hard-coded to this particular
# scrape's (streamer, sentiment) row layout; they will silently break if the
# underlying data changes.
print(names[0])
print(1-np.sum(values[0:3]))
print(names[3])
print(1-np.sum(values[3:4]))
print(names[4])
print(1-np.sum(values[4:6]))
print(names[6])
print(1-np.sum(values[6:7]))
print(names[7])
print(1-np.sum(values[7:8]))
print(names[8])
print(1-np.sum(values[8:11]))
print(names[11])
print(1-np.sum(values[11:]))
# No sling
# Scores stored in alphabetical streamer order; Sling had no live-event tweets.
live_event_scores = [1-np.sum(values[0:3]), 1-np.sum(values[3:4]), 1-np.sum(values[4:6]),
1-np.sum(values[6:7]), 1-np.sum(values[7:8]), np.nan,
1-np.sum(values[8:11]),1-np.sum(values[11:])]
live_event_scores
# --- "Original programming" dimension: shows/movies/episodes chatter ---
original_programming = df[df.text_clean.str.contains('episode|movie|original|show|film|artiste|channel')]
original_programming = original_programming[~ original_programming.text_clean.str.contains('live|news|channel|next|game|missed|team')]
original_programming_counts = original_programming.groupby(['streamer','streamer_sentiment']).agg({'Document':['count']})
original_programming_counts
# Per-streamer sentiment shares, squared for the Herfindahl-style score below.
cancel_percent = original_programming_counts.groupby(level=0).apply(lambda x:x / float(x.sum()))
cancel_percent
names = cancel_percent.index.get_level_values(0)
values = np.square(cancel_percent.values)
names
values
# NOTE(review): slice boundaries are hard-coded to this scrape's row layout.
print('Final Scores:')
print(names[0])
print(1-np.sum(values[0:3]))
print(names[3])
print(1-np.sum(values[3:6]))
print(names[6])
print(1-np.sum(values[6:7]))
print(names[7])
print(1-np.sum(values[7:10]))
print(names[10])
print(1-np.sum(values[10:]))
# no Sling, YouTubeTV, and fuboTV
# FIX: the last stored score originally used np.sum(values[10]) while the
# printed score above uses np.sum(values[10:]); aligned the stored value with
# the printed one.
original_programming_scores = [1-np.sum(values[0:3]), 1-np.sum(values[3:6]), 1-np.sum(values[6:7]),
                               1-np.sum(values[7:10]), 1-np.sum(values[10:]), np.nan, np.nan, np.nan]
original_programming_scores
# --- "Overall service" dimension: general service/experience keywords ---
overall_service = df[df.text_clean.str.contains('book|great|like|sister|communication|baby|citation|streaming|still|pense|descartes|chose|plus|tv|view|watch|lineup|dvr|cordcutter|cordcutters|review|subscriber|available')]
# Exclude tweets already captured by the live-events or original-programming
# dimensions.
overall_service = overall_service[~ overall_service.text_clean.str.contains('live|news|channel|next|game|missed|team|episode|movie|original|show|film|artiste|channel')]
overall_service_counts = overall_service.groupby(['streamer','streamer_sentiment']).agg({'Document':['count']})
overall_service_counts
# Per-streamer sentiment shares, squared for the diversity-style score below.
overall_percent = overall_service_counts.groupby(level=0).apply(lambda x:x / float(x.sum()))
overall_percent
names = overall_percent.index.get_level_values(0)
values = np.square(overall_percent.values)
names
print('Final Scores:')
# NOTE(review): slice boundaries are hard-coded to this scrape's row layout.
print(names[0])
print(1-np.sum(values[0:2]))
print(names[2])
print(1-np.sum(values[2:4]))
print(names[4])
print(1-np.sum(values[4:7]))
print(names[7])
print(1-np.sum(values[7:9]))
print(names[9])
print(1-np.sum(values[9:11]))
print(names[11])
print(1-np.sum(values[11:13]))
print(names[13])
print(1-np.sum(values[13:15]))
print(names[15])
print(1-np.sum(values[15:]))
# All eight streamers have overall-service tweets, so no NaN placeholders here.
overall_service_scores = [1-np.sum(values[0:2]), 1-np.sum(values[2:4]), 1-np.sum(values[4:7]),
1-np.sum(values[7:9]), 1-np.sum(values[9:11]), 1-np.sum(values[11:13]),
1-np.sum(values[13:15]), 1-np.sum(values[15:])]
# Assemble the three dimension scores into one table (rows in alphabetical
# streamer order, matching the score lists above) and scale to 0-10.
streamers = ['AppleTV', 'DisneyTV', 'Hulu', 'ItsOnATT', 'Philo', 'Sling', 'YouTubeTV', 'fuboTV']
live_event_scores = pd.DataFrame(live_event_scores)
original_programming_scores = pd.DataFrame(original_programming_scores)
overall_service_scores = pd.DataFrame(overall_service_scores)
final_scores = pd.concat([live_event_scores,original_programming_scores,overall_service_scores],axis=1)
final_scores
final_scores.columns = ['Live Events', 'Original Programing', 'Overall Streamers']
final_scores.index = streamers
final_scores = round(final_scores*10,2)
final_scores
from sklearn.manifold import MDS
# due to the NA values for some of the dimensions had to drop those streaming services
final_scores = final_scores.dropna()
# Project the 3-D scores onto 2-D for the positioning map; fixed seed keeps
# the (rotation-ambiguous) MDS embedding reproducible.
embedding = MDS(n_components=2,random_state=2019)
scores_transformed = embedding.fit_transform(final_scores)
scores_transformed.shape
import matplotlib.pyplot as plt
from matplotlib.offsetbox import (TextArea, DrawingArea, OffsetImage,
                                  AnnotationBbox)
from matplotlib.cbook import get_sample_data

# Offset (in points) of each service's logo box relative to its data point.
# Services without an entry (Sling, YouTubeTV, fuboTV were dropped by the
# earlier dropna) get no logo annotation.
_LOGO_OFFSETS = {
    'Hulu': (-100., -20.),
    'DisneyTV': (100., -20.),
    'Philo': (-100., 50.),
    'AppleTV': (-100., -20.),
    'ItsOnATT': (100., 70.),
}

# Scatter the 2-D MDS embedding and annotate each point with its logo image.
# FIX: collapsed five copy-pasted if/elif branches (identical except for the
# image filename and xybox offset) into one data-driven code path.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(scores_transformed[:, 0], scores_transformed[:, 1])
for i, txt in enumerate(list(final_scores.index)):
    xy = [scores_transformed[i, 0], scores_transformed[i, 1]]
    if txt not in _LOGO_OFFSETS:
        continue
    fn = get_sample_data(txt + ".png", asfileobj=False)
    arr_img = plt.imread(fn, format='png')
    imagebox = OffsetImage(arr_img, zoom=1.0)
    imagebox.image.axes = ax
    ab = AnnotationBbox(imagebox, xy,
                        xybox=_LOGO_OFFSETS[txt],
                        xycoords='data',
                        boxcoords="offset points",
                        pad=0.5,
                        )
    ax.add_artist(ab)
plt.xlabel("X axis, Original Programming")
plt.ylabel("Y axis, Live Events")
plt.show()
Sampling limitations
Product positioning maps & brand health
Approach relied upon to produce delighters and disappointers?
Result reliability